/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pdfbox.examples.util; import org.apache.pdfbox.exceptions.InvalidPasswordException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; import org.apache.pdfbox.pdmodel.common.PDRectangle; import java.io.IOException; import java.util.List; import java.util.LinkedList; import java.lang.Math; import java.util.regex.Pattern; /** * This is an example on how to get some x/y coordinates of text. * * Usage: java org.apache.pdfbox.examples.util.PrintWordLocations <input-pdf> * * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a> * @version $Revision: 1.7 $ */ public class PrintWordLocations extends PDFTextStripper { public class MarginOffset { protected float _dx; protected float _dy; public MarginOffset(float dx, float dy) { _dx = dx; _dy = dy; } public float getX() { return _dx; } public float getY() { return _dy; } } public class WordBoxEmitter { protected LinkedList<WordBox> box_list = new LinkedList<WordBox>(); protected Character last_character = new Character('\0'); protected StringBuffer word = new StringBuffer(""); protected MarginOffset _offset = new MarginOffset(0, 0); public void setOffset(MarginOffset offset) { _offset = offset; } protected boolean endsWord(char ch) { return !(isAlnumOrApostrophe(ch) || isHyphen(ch)); } protected boolean isAlnumOrApostrophe(char ch) { return Character.isLetterOrDigit(ch) || (ch == '\''); } protected boolean isHyphen(char ch) { return ch == '-'; } protected void emit() { float pointsToInch1200 = (float)16.6666; float mysteryHeightScale = (float)1.5; float height; float width; float hpos; float vpos; if (word.toString().trim().length() > 0) { for (WordBox wordbox : box_list) { width = wordbox._width * pointsToInch1200; height = wordbox._height * pointsToInch1200 * mysteryHeightScale; hpos = (wordbox._xmin + _offset.getX()) * pointsToInch1200; vpos = (wordbox._ymin + _offset.getY()) * pointsToInch1200 - height; System.out.println( "<String HEIGHT=\"" + height + "\" WIDTH=\"" + width + "\" HPOS=\"" + hpos + "\" VPOS=\"" + vpos + "\" CONTENT=\"" + word.toString().trim() + "\"/>" ); } } word = new StringBuffer(""); last_character = new Character('\0'); box_list.clear(); } protected void processTextPosition(TextPosition text, MarginOffset offset) { Character current_character = text.getCharacter().toLowerCase().charAt(0); setOffset(offset); if (endsWord(current_character)) { emit(); } else { if (box_list.size() == 0) { box_list.addLast(new WordBox(text)); } else if (box_list.getLast().accepts(text)) { box_list.getLast().extendBy(text); } else { if (!isHyphen(last_character)) { emit(); } box_list.addLast(new WordBox(text)); last_character = new Character('\0'); } if (isHyphen(last_character)) { word = word.append(last_character); } if (isAlnumOrApostrophe(current_character)) { word = word.append(current_character); } } last_character = current_character; } protected void endOfPage() { if (box_list.size() > 0) { emit(); } } } public class WordBox { public float _xmin; public float _ymin; public float _fontsize; public float _xscale; public float _yscale; public float _height; public float _width; public WordBox(TextPosition text) { _xmin = text.getXDirAdj(); _ymin = text.getYDirAdj(); _fontsize = text.getFontSize(); _xscale = text.getXScale(); _yscale = text.getYScale(); _height = text.getHeightDir(); _width = text.getWidthDirAdj(); } public boolean rejects(TextPosition text) { return (text.getXDirAdj() < _xmin) || (text.getYDirAdj() + text.getWidthOfSpace() < _ymin); } public boolean accepts(TextPosition text) { return !rejects(text); } public void extendBy(TextPosition text) { float current_xmin = _xmin; float current_xmax = _xmin + _width; float current_ymin = _ymin; float current_ymax = _ymin + _height; float text_xmin = text.getXDirAdj(); float text_xmax = text_xmin + text.getWidthDirAdj(); float text_ymin = text.getYDirAdj(); float text_ymax = text_ymin + text.getHeightDir(); float new_xmin = Math.min(current_xmin, text_xmin); float new_xmax = Math.max(current_xmax, text_xmax); float new_ymin = Math.min(current_ymin, text_ymin); float new_ymax = Math.max(current_ymax, text_ymax); _xmin = new_xmin; _width = new_xmax - new_xmin; _ymin = new_ymin; _height = new_ymax - new_ymin; } } protected WordBoxEmitter emitter = new WordBoxEmitter(); protected MarginOffset _offset = new MarginOffset(0, 0); /** * Default constructor. * * @throws IOException If there is an error loading text stripper properties. */ public PrintWordLocations() throws IOException { super.setSortByPosition( true ); } public void setOffset(MarginOffset offset) { _offset = offset; } public void processDocuments( String[] args ) throws Exception { if( args.length != 1 ) { usage(); } else { PDDocument document = null; try { document = PDDocument.load( args[0] ); if( document.isEncrypted() ) { try { document.decrypt( "" ); } catch( InvalidPasswordException e ) { System.err.println( "Error: Document is encrypted with a password." ); System.exit( 1 ); } } PrintWordLocations printer = new PrintWordLocations(); List allPages = document.getDocumentCatalog().getAllPages(); System.out.println( "<?xml version=\"1.0\" encoding=\"UTF-8\"?><alto xmlns=\"http://www.loc.gov/standards/alto/alto-v2.0.xsd\"><Description><MeasurementUnit>inch1200</MeasurementUnit></Description><Layout>" ); for( int i=0; i<allPages.size(); i++ ) { PDPage page = (PDPage)allPages.get( i ); if (page.getCropBox() != null) { PDRectangle mediaBox = (PDRectangle)page.getMediaBox(); PDRectangle cropBox = (PDRectangle)page.getCropBox(); printer.setOffset(new MarginOffset( cropBox.getLowerLeftX() - mediaBox.getLowerLeftX(), cropBox.getLowerLeftY() - mediaBox.getLowerLeftY() )); } System.out.println( "<Page>" ); System.out.println( "<PrintSpace>" ); System.out.println( "<TextBlock>" ); System.out.println( "<TextLine>" ); PDStream contents = page.getContents(); if( contents != null ) { printer.processStream( page, page.findResources(), page.getContents().getStream() ); } endOfPage(); System.out.println( "</TextLine>" ); System.out.println( "</TextBlock>" ); System.out.println( "</PrintSpace>" ); System.out.println( "</Page>"); } System.out.println( "</Layout></alto>" ); } finally { if( document != null ) { document.close(); } } } } /** * This will print the documents data. * * @param args The command line arguments. * * @throws Exception If there is an error parsing the document. */ public static void main( String[] args ) throws Exception { PrintWordLocations handler = new PrintWordLocations(); handler.processDocuments(args); } /** * A method provided as an event interface to allow a subclass to perform * some specific functionality when text needs to be processed. * * @param text The text to be processed */ protected void processTextPosition( TextPosition text ) { emitter.processTextPosition(text, _offset); } protected void endOfPage() { emitter.endOfPage(); } /** * This will print the usage for this document. */ private static void usage() { System.err.println( "Usage: java org.apache.pdfbox.examples.pdmodel.PrintWordLocations <input-pdf>" ); } }